The UCI Machine Learning Repository hosts a large collection of machine learning datasets. We will use the Bank Marketing dataset; see https://archive.ics.uci.edu/ml/datasets/Bank+Marketing for more information.
Load the train and test datasets
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)
In [3]:
# Load the train dataset
train = pd.read_csv("../Data/train.csv")
In [4]:
# Load the test dataset
test = pd.read_csv("../Data/test.csv")
In [5]:
# View the first 5 records of train
train.head()
Out[5]:
In [6]:
# View the last 10 records of test
test.tail(10)
Out[6]:
In [7]:
# List the attributes/feature names/columns in train dataset
train.columns
Out[7]:
In [8]:
# List the attributes in test dataset.
test.columns
Out[8]:
In [9]:
type(test.columns)
Out[9]:
In [10]:
train.columns.values
Out[10]:
In [11]:
test.columns.values
Out[11]:
In [12]:
# Do they match with train?
[x in test.columns.values for x in train.columns.values]
Out[12]:
From the data dictionary: y (named deposit in these files) - has the client subscribed a term deposit? (binary: 'yes', 'no')
In [13]:
train.dtypes
Out[13]:
In [14]:
# Find unique values in deposit for train dataset
pd.unique(train.deposit)
Out[14]:
In [15]:
# Find unique values in deposit for test dataset. Are they the same?
pd.unique(test.deposit)
Out[15]:
In [16]:
pd.unique(test['month'])
Out[16]:
In [17]:
# Find frequency of deposit in train dataset
train.deposit.value_counts()
Out[17]:
In [18]:
# Find frequency of deposit in test dataset
test.deposit.value_counts()
Out[18]:
In [19]:
type(train.deposit.value_counts())
Out[19]:
In [20]:
# Is the distribution of deposit similar in train and test?
print("train:",train.deposit.value_counts()[1]/train.shape[0]*100)
print("test:",test.deposit.value_counts()[1]/test.shape[0]*100)
In [21]:
# Find number of rows and columns in train
train.shape
Out[21]:
In [22]:
# Find number of rows and columns in test
test.shape
Out[22]:
In [23]:
train.describe()
Out[23]:
Where did the remaining columns go?
By default, describe() only summarizes the numeric columns; categorical (object) columns are excluded.
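pandas can also summarize the categorical columns if asked; a quick check using describe's include argument (standard pandas behavior):
In [ ]:
# Summary statistics for the categorical (object) columns only
train.describe(include=['object'])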
In [24]:
# Create labels: 0 for 'no' and 1 for 'yes' in the train dataset
labels = np.where(train.deposit=="no", 0, 1)
In [25]:
# Display the counts of 0 and 1 - do they match the value counts we saw above?
np.unique(labels, return_counts=True)
Out[25]:
In [26]:
train.head()
Out[26]:
In [27]:
train.loc[:,['deposit','age']]
Out[27]:
In [28]:
bivariate_plot_deposit_age = train.loc[:,["deposit", "age"]].copy()
In [29]:
bivariate_plot_deposit_age.head()
Out[29]:
In [30]:
bivariate_plot_deposit_age.age.hist()
Out[30]:
In [31]:
sns.stripplot(x="deposit", y="age", data=bivariate_plot_deposit_age,
              jitter=True, alpha=0.1)
Out[31]:
In [32]:
train.plot(kind="scatter", x = 'age', y = 'pdays', color = labels, alpha = 0.5, s=50)
Out[32]:
In [33]:
train.plot(kind="scatter", x = 'day', y = 'duration', color = labels,
alpha = 0.5, s=50)
Out[33]:
In [34]:
import sklearn
from sklearn import preprocessing
In [35]:
# Find the columns that are categorical
train.select_dtypes(include=['object'])
Out[35]:
In [36]:
train_to_convert = train.select_dtypes(include=["object"]).copy()
test_to_convert = test.select_dtypes(include=["object"]).copy()
In [37]:
train_np = np.array(train_to_convert)
test_np = np.array(test_to_convert)
In [38]:
for i in range(train_np.shape[1]):
    # Fit an encoder on the train values of this column, then apply the
    # same mapping to both train and test
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_np[:, i]))
    train_np[:, i] = lbl.transform(train_np[:, i])
    test_np[:, i] = lbl.transform(test_np[:, i])
In [39]:
# Display train_np
train_np
Out[39]:
In [40]:
# How would you transform test?
test_np
Out[40]:
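The loop above already applied the train-fitted encoders to test; that fails if test contains a category never seen in train. A safer variant of the encoding loop (a sketch, not part of the original notebook) fits each encoder on the union of train and test values:
In [ ]:
# Hypothetical variant: fit each LabelEncoder on the combined train + test
# values so unseen test categories cannot raise a transform error
for i in range(train_np.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_np[:, i]) + list(test_np[:, i]))
    train_np[:, i] = lbl.transform(train_np[:, i])
    test_np[:, i] = lbl.transform(test_np[:, i])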
In [43]:
# Now, merge the numeric and encoded train variables into a single array
In [44]:
train_numeric = np.array(train.select_dtypes(exclude=["object"]).copy())
In [45]:
train_numeric.shape
Out[45]:
In [46]:
train_encoded = np.concatenate([train_numeric, train_np], axis=1)
In [47]:
# Now, merge the numeric and encoded test variables into a single array
test_numeric = np.array(test.select_dtypes(exclude=["object"]).copy())
In [48]:
test_encoded = np.concatenate([test_numeric, test_np], axis=1)
In [49]:
# Create train X and train Y
In [50]:
xlen = train_encoded.shape[1]-1
In [51]:
train_encoded_X = train_encoded[:, :xlen]
In [52]:
train_encoded_Y = np.array(train_encoded[:, -1], dtype=float)
In [53]:
train_encoded_Y
Out[53]:
In [54]:
# Create test X
test_encoded_X = test_encoded[:, :xlen]
In [55]:
# Create test Y
test_encoded_Y = np.array(test_encoded[:, -1], dtype=float)
In [56]:
# All-zero baseline model: predict 0 ("no") for every test record
model_allzero = np.zeros_like(test_encoded_Y)
In [58]:
# The mean squared error of the all-zero model; on 0/1 labels,
# MSE * 100 is the misclassification rate in percent
print("Mean Squared Error on all zero model: %.2f"
      % (np.mean((model_allzero - test_encoded_Y) ** 2) * 100))
Y = β0 + β1X1 + β2X2 + … + βnXn
In [59]:
from sklearn import linear_model
In [60]:
model_linear = linear_model.LinearRegression()
In [61]:
model_linear.fit(train_encoded_X, train_encoded_Y)
Out[61]:
In [62]:
# The coefficients
print('Coefficients: \n', model_linear.coef_)
In [63]:
# Prediction
model_linear_prediction = model_linear.predict(test_encoded_X)
In [64]:
model_linear_prediction
Out[64]:
In [65]:
model_linear_prediction = np.where(model_linear_prediction>0.5, 1, 0)
In [66]:
# The mean squared error on train (raw regression output, not thresholded)
print("Mean Squared Error on train: %.2f"
      % (np.mean((model_linear.predict(train_encoded_X) - train_encoded_Y) ** 2) * 100))
In [67]:
# The mean squared error on test, using the thresholded 0/1 predictions
print("Mean Squared Error on test: %.2f"
      % (np.mean((model_linear_prediction - test_encoded_Y) ** 2) * 100))
In [68]:
model_logistic_L2 = linear_model.LogisticRegression()
In [69]:
model_logistic_L2.fit(train_encoded_X, train_encoded_Y)
Out[69]:
In [70]:
# The coefficients
print('Coefficients: \n', model_logistic_L2.coef_)
In [71]:
# Prediction
model_logistic_L2_prediction = model_logistic_L2.predict(test_encoded_X)
In [72]:
np.unique(model_logistic_L2_prediction)
Out[72]:
In [73]:
np.sum(model_logistic_L2_prediction)
Out[73]:
In [74]:
# The mean square error on train
print("Mean Squared Error on train: %.2f"
% (np.mean((model_logistic_L2.predict(train_encoded_X) - train_encoded_Y) ** 2)*100))
In [75]:
# The mean square error on test
print("Mean Squared Error on test: %.2f"
% (np.mean((model_logistic_L2_prediction - test_encoded_Y) ** 2)*100))
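Because predictions and labels are both 0/1, this MSE times 100 is simply the misclassification rate in percent. As a sanity check, sklearn.metrics reports the same information directly:
In [ ]:
from sklearn.metrics import accuracy_score, confusion_matrix

# 100 * (1 - accuracy) should match the test error printed above
print("Accuracy on test:", accuracy_score(test_encoded_Y, model_logistic_L2_prediction))
print(confusion_matrix(test_encoded_Y, model_logistic_L2_prediction))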
In [76]:
# Code here. Report evaluation
model_logistic_L1 = linear_model.LogisticRegression(penalty='l1')
In [77]:
model_logistic_L1.fit(train_encoded_X, train_encoded_Y)
Out[77]:
In [78]:
# The coefficients
print('Coefficients: \n', model_logistic_L1.coef_)
In [79]:
# Prediction
model_logistic_L1_prediction = model_logistic_L1.predict(test_encoded_X)
In [80]:
np.unique(model_logistic_L1_prediction)
Out[80]:
In [81]:
np.sum(model_logistic_L1_prediction)
Out[81]:
In [82]:
# The mean square error on train
print("Mean Squared Error on train: %.2f"
% (np.mean((model_logistic_L1.predict(train_encoded_X) - train_encoded_Y) ** 2)*100))
In [83]:
# The mean square error on test
print("Mean Squared Error on test: %.2f"
% (np.mean((model_logistic_L1_prediction - test_encoded_Y) ** 2)*100))
In [84]:
model_logistic_L2C = linear_model.LogisticRegression(C = 2)
In [85]:
model_logistic_L2C.fit(train_encoded_X, train_encoded_Y)
Out[85]:
In [86]:
# The coefficients
print('Coefficients: \n', model_logistic_L2C.coef_)
In [87]:
# Prediction
model_logistic_L2C_prediction = model_logistic_L2C.predict(test_encoded_X)
In [88]:
np.unique(model_logistic_L2C_prediction)
Out[88]:
In [89]:
np.sum(model_logistic_L2C_prediction)
Out[89]:
In [90]:
# The mean square error on train
print("Mean Squared Error on train: %.2f"
% (np.mean((model_logistic_L2C.predict(train_encoded_X) - train_encoded_Y) ** 2)*100))
In [91]:
# The mean square error on test
print("Mean Squared Error on test: %.2f"
% (np.mean((model_logistic_L2C_prediction - test_encoded_Y) ** 2)*100))
In [107]:
from sklearn import tree
from sklearn.externals.six import StringIO
# import pydot
In [108]:
model_DT = tree.DecisionTreeClassifier()
In [109]:
# Let's use only two of the features (columns 1 and 2 of train_encoded_X) for the model
In [110]:
model_DT.fit(train_encoded_X[:,1:3], train_encoded_Y)
Out[110]:
In [111]:
# dot_data = StringIO()
# tree.export_graphviz(model_DT, out_file=dot_data)
# graph = pydot.graph_from_dot_data(dot_data.getvalue())
# graph.write_pdf("dt1.pdf")
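If pydot is not installed, the tree can still be exported to a .dot file and rendered with the graphviz command-line tool (a sketch, assuming graphviz is available on your system):
In [ ]:
# Write the fitted tree to dt1.dot; render with: dot -Tpdf dt1.dot -o dt1.pdf
with open("dt1.dot", "w") as f:
    tree.export_graphviz(model_DT, out_file=f)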
In [98]:
# Prediction
model_DT_prediction = model_DT.predict(test_encoded_X[:,1:3])
In [99]:
# The mean square error on train
print("Mean Squared Error on train: %.2f"
% (np.mean((model_DT.predict(train_encoded_X[:,1:3]) - train_encoded_Y) ** 2)*100))
In [100]:
# The mean square error on test
print("Mean Squared Error on test: %.2f"
% (np.mean((model_DT_prediction - test_encoded_Y) ** 2)*100))
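An unconstrained decision tree typically drives the train error to near zero while doing worse on test. Limiting the depth is the usual remedy; a minimal sketch, with max_depth=5 as an illustrative choice not taken from the original:
In [ ]:
# Hypothetical depth-limited tree on the same two features
model_DT_shallow = tree.DecisionTreeClassifier(max_depth=5)
model_DT_shallow.fit(train_encoded_X[:, 1:3], train_encoded_Y)
print("Mean Squared Error on test: %.2f"
      % (np.mean((model_DT_shallow.predict(test_encoded_X[:, 1:3]) - test_encoded_Y) ** 2) * 100))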
In [113]:
model_DTAll = tree.DecisionTreeClassifier()
In [117]:
model_DTAll.fit(train_encoded_X, train_encoded_Y)
Out[117]:
In [118]:
# Prediction
model_DTAll_prediction = model_DTAll.predict(test_encoded_X)
In [119]:
# The mean square error on train
print("Mean Squared Error on train: %.2f"
% (np.mean((model_DTAll.predict(train_encoded_X) - train_encoded_Y) ** 2)*100))
In [120]:
# The mean square error on test
print("Mean Squared Error on test: %.2f"
% (np.mean((model_DTAll_prediction - test_encoded_Y) ** 2)*100))
In [101]:
from sklearn.ensemble import RandomForestClassifier
In [ ]:
?RandomForestClassifier
In [102]:
model_RF = RandomForestClassifier()
In [103]:
model_RF.fit(train_encoded_X, train_encoded_Y)
Out[103]:
In [104]:
# Prediction
model_RF_prediction = model_RF.predict(test_encoded_X)
In [105]:
# The mean square error on train
print("Mean Percentage Error on train: %.2f"
% (np.mean((model_RF.predict(train_encoded_X) - train_encoded_Y) ** 2)*100))
In [106]:
# The mean square error on test
print("Mean Percentage Error on test: %.2f"
% (np.mean((model_RF_prediction - test_encoded_Y) ** 2)*100))
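A fitted random forest also exposes feature_importances_, which shows which columns drive the predictions. A sketch that labels them, assuming the column order of train_encoded_X (numeric columns first, then the label-encoded categorical ones, with the deposit target last and excluded):
In [ ]:
# Rank features by importance; feature_names mirrors how train_encoded_X
# was built (numeric columns, then categorical), trimmed to xlen columns
feature_names = (list(train.select_dtypes(exclude=["object"]).columns)
                 + list(train_to_convert.columns))[:xlen]
for name, imp in sorted(zip(feature_names, model_RF.feature_importances_),
                        key=lambda t: -t[1]):
    print("%-10s %.3f" % (name, imp))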
In [122]:
model_RFMod = RandomForestClassifier(max_depth=8, oob_score=True, n_estimators=400)
In [123]:
model_RFMod.fit(train_encoded_X, train_encoded_Y)
Out[123]:
In [124]:
# Prediction
model_RFMod_prediction = model_RFMod.predict(test_encoded_X)
In [125]:
# The mean square error on train
print("Mean Percentage Error on train: %.2f"
% (np.mean((model_RFMod.predict(train_encoded_X) - train_encoded_Y) ** 2)*100))
In [126]:
# The mean square error on test
print("Mean Percentage Error on test: %.2f"
% (np.mean((model_RFMod_prediction - test_encoded_Y) ** 2)*100))
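Because the forest was built with oob_score=True, it also carries an out-of-bag accuracy estimate, a free cross-validation-like check that uses no held-out data:
In [ ]:
# Out-of-bag accuracy estimate from the 400-tree forest
model_RFMod.oob_score_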
In [127]:
from sklearn.cross_validation import StratifiedKFold
In [ ]:
?StratifiedKFold
In [128]:
skf = StratifiedKFold(train_encoded_Y, 5, random_state=1131, shuffle=True)
In [129]:
# Note: this rebinds `train` and `test` to fold index arrays, shadowing
# the DataFrames loaded earlier (they are reloaded further down)
for train, test in skf:
    print("%s %s" % (train, test))
    print(train.shape, test.shape)
In [130]:
model_RF = RandomForestClassifier()
In [131]:
for k, (train, test) in enumerate(skf):
    model_RF.fit(train_encoded_X[train], train_encoded_Y[train])
    print("fold:", k+1, model_RF.score(train_encoded_X[test], train_encoded_Y[test]))
In [158]:
cv_error = []
for k, (train, test) in enumerate(skf):
    model_RF.fit(train_encoded_X[train], train_encoded_Y[train])
    print(k)
    # Evaluate on the held-out fold
    cv_error.append(np.mean((model_RF.predict(train_encoded_X[test]) - train_encoded_Y[test]) ** 2) * 100)
In [159]:
cv_error
Out[159]:
In [163]:
# Average the CV error across folds
np.mean(cv_error)
Out[163]:
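The same estimate is available in one call via cross_val_score; for a classifier with an integer cv it stratifies the folds internally. A sketch against the same old sklearn.cross_validation API used above:
In [ ]:
from sklearn.cross_validation import cross_val_score

# Accuracy per fold; the mean complements the CV error computed above
scores = cross_val_score(model_RF, train_encoded_X, train_encoded_Y, cv=5)
print(scores, scores.mean())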
In [132]:
# Reload the original DataFrames (the names train/test were shadowed by
# the fold index arrays above)
train = pd.read_csv("../Data/train.csv")
test = pd.read_csv("../Data/test.csv")
In [133]:
train_one_hot = pd.get_dummies(train)
In [134]:
train_one_hot.head()
Out[134]:
In [135]:
test_one_hot = pd.get_dummies(test)
test_one_hot.head()
Out[135]:
In [136]:
# Check if columns are the same
In [149]:
[x in test_one_hot.columns.values for x in train_one_hot.columns.values]
Out[149]:
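The comprehension above only checks one direction (every train column present in test). A symmetric check compares the full column sets, and if they ever differ, reindexing keeps the matrices aligned (a sketch, not from the original):
In [ ]:
# Do train and test produce exactly the same dummy columns?
print(set(train_one_hot.columns) == set(test_one_hot.columns))
# If not, align test to the train columns, filling missing dummies with 0:
# test_one_hot = test_one_hot.reindex(columns=train_one_hot.columns, fill_value=0)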
In [137]:
# Create train X, train Y, test X, test Y
In [138]:
train_X = train_one_hot.iloc[:, :-2]  # all columns except the two deposit dummies
In [139]:
train_X.columns
Out[139]:
In [140]:
train_Y = train_one_hot.iloc[:, -1]  # deposit_yes
In [141]:
train_Y.head()
Out[141]:
In [142]:
test_X = test_one_hot.iloc[:, :-2]
test_Y = test_one_hot.iloc[:, -1]
In [143]:
# Run Random Forest and check accuracy
In [144]:
model_RF = RandomForestClassifier(n_estimators=400, max_depth=8, oob_score=True, n_jobs=-1)
In [145]:
model_RF.fit(train_X, train_Y)
Out[145]:
In [146]:
# Prediction
model_RF_prediction = model_RF.predict(test_X)
In [147]:
# The mean square error on train
print("Mean Percentage Error on train: %.2f"
% (np.mean((model_RF.predict(train_X) - train_Y) ** 2)*100))
In [148]:
# The mean square error on test
print("Mean Percentage Error on test: %.2f"
% (np.mean((model_RF_prediction - test_Y) ** 2)*100))